using CSV
using DataFrames
using Turing
using Distributions
using MCMCChains, Plots, StatsPlots
using Random:seed! # to set seed
using Statistics
using StatsFuns: logistic
using LazyArrays
using Suppressor
# Seed the global RNG so the sampling below is reproducible.
# (The original also called `MersenneTwister(1)` on its own line; that
# constructs a fresh RNG object and discards it — a no-op — so it was removed.)
seed!(1)

Question 1

The posterior distribution for the difference in mean blood percentages between those clamped early (procedure 1) and those not clamped until the placenta began to descend (procedure 2) has mean -2.45859 (95% HDI: -3.94466, -0.944554). The 95% HDI lies entirely below 0 (it does not contain 0), supporting the conclusion that the mean blood percentage for procedure 1 is smaller than for procedure 2.

@model function equality(x1, x2)
    # Vague Gamma hyperpriors on each group's shape and rate parameters.
    # Gamma(0.001, 1/0.001) is near-flat over the positive reals.
    # (The order of these `~` statements is preserved from the original so the
    # RNG stream, and hence the exact draws, are unchanged.)
    alpha1 ~ Gamma(0.001, 1/0.001)
    alpha2 ~ Gamma(0.001, 1/0.001)
    beta1 ~ Gamma(0.001, 1/0.001)
    beta2 ~ Gamma(0.001, 1/0.001)

    # Likelihood: each group is Gamma-distributed with its own shape/rate.
    # Distributions.Gamma takes a scale, hence the 1/beta arguments.
    x1 ~ Gamma(alpha1, 1/beta1)
    x2 ~ Gamma(alpha2, 1/beta2)

    # The mean of Gamma(a, 1/b) is a/b, so this returns the difference of the
    # two group means; it is recovered later via generated_quantities.
    return alpha1/beta1 - alpha2/beta2
end
equality (generic function with 2 methods)
# Load the cord-clamping data; column x is procedure 1 and y is procedure 2
# (assumed from the equality(q1.x, q1.y) call below — TODO confirm columns).
q1 = DataFrame(CSV.File("babies.csv"))
# describe(q1)
model = equality(q1.x, q1.y)
# Sample 4 chains of 10,000 draws each with the No-U-Turn Sampler.
chain = sample(model,  NUTS(), MCMCThreads(), 10_000, 4);
# Gelman–Rubin diagnostic: PSRF values close to 1 indicate convergence.
gelmandiag(chain)
Gelman, Rubin, and Brooks diagnostic
  parameters      psrf    psrfci 
      Symbol   Float64   Float64 

      alpha1    1.0000    1.0001
      alpha2    1.0003    1.0006
       beta1    1.0000    1.0002
       beta2    1.0003    1.0005
# Trace and density plots for a visual convergence check.
plot(chain)
# Recover the model's return value (mudiff) for every posterior draw.
mudiff = generated_quantities(model, chain)
mudiff = mudiff[:]
# Posterior mean and 95% highest-density interval of the mean difference.
df = DataFrame(:mean => mean(mudiff), :lb =>hdi(mudiff, 0.95)[1], :ub =>hdi(mudiff, 0.95)[2])

1 rows × 3 columns

   mean      lb        ub
   Float64   Float64   Float64
1  -2.45859  -3.94466  -0.944554

Question 2

The probability that a female wolf with measures x3 = 5.28 and x7 = 1.78 comes from the Arctic habitat is 0.7249557664695061, as computed by the prediction below. (An earlier draft of this answer stated 0.5029, which does not match the notebook's own output.)

# Load the wolves data and summarize each column.
q2 = DataFrame(CSV.File("wolves.csv"))
describe(q2)

4 rows × 7 columns

   variable   mean     min    median   max    nmissing   eltype
   Symbol     Float64  Real   Float64  Real   Int64      DataType
1  arctic     0.64     0      1.0      1      0          Int64
2  gender     0.36     0      0.0      1      0          Int64
3  x3         5.5172   4.92   5.55     5.98   0          Float64
4  x7         1.8272   1.6    1.83     2.07   0          Float64
# Bayesian logistic regression: P(y=1) = logistic(α + Xβ).
#   X          — design matrix (one row per observation)
#   y          — binary response vector
#   predictors — number of columns of X (defaults from X itself)
@model function logisticreg(X, y; predictors=size(X, 2))
    # Weakly informative priors (Normal's second argument is the std dev).
    α ~ Normal(0, 10)
    β ~ filldist(Normal(0, 100), predictors)

    # Vectorized Bernoulli-logit likelihood; LazyArray/@~ keeps the broadcast
    # lazy so the likelihood is evaluated without materializing intermediate
    # arrays — a Turing performance idiom. Do not "simplify" this line.
    y ~ arraydist(LazyArray(@~ BernoulliLogit.(α .+ X * β)))
end

# Design matrix: every column except the response (gender, x3, x7).
X = Matrix(q2[:, Not(:arctic)])
# Binary response: arctic habitat indicator (1 = Arctic, presumably — confirm).
y = q2[:, :arctic]

model = logisticreg(X, y)
# 4 chains × 10,000 draws with NUTS.
chain = sample(model, NUTS(), MCMCThreads(), 10_000, 4)
┌ Warning: Only a single thread available: MCMC chains are not sampled in parallel
└ @ AbstractMCMC /Users/aaron/.julia/packages/AbstractMCMC/BPJCW/src/sample.jl:291
┌ Info: Found initial step size
│   ϵ = 3.2
└ @ Turing.Inference /Users/aaron/.julia/packages/Turing/y0DW3/src/inference/hmc.jl:188
┌ Info: Found initial step size
│   ϵ = 1.6
└ @ Turing.Inference /Users/aaron/.julia/packages/Turing/y0DW3/src/inference/hmc.jl:188
┌ Info: Found initial step size
│   ϵ = 1.6
└ @ Turing.Inference /Users/aaron/.julia/packages/Turing/y0DW3/src/inference/hmc.jl:188
┌ Info: Found initial step size
│   ϵ = 3.2
└ @ Turing.Inference /Users/aaron/.julia/packages/Turing/y0DW3/src/inference/hmc.jl:188
Sampling (1 threads): 100%|█████████████████████████████| Time: 0:00:21
Chains MCMC chain (10000×16×4 Array{Float64, 3}):

Iterations        = 1001:1:11000
Number of chains  = 4
Samples per chain = 10000
Wall duration     = 30.68 seconds
Compute duration  = 30.25 seconds
parameters        = α, β[2], β[3], β[1]
internals         = lp, n_steps, is_accept, acceptance_rate, log_density, hamiltonian_energy, hamiltonian_energy_error, max_hamiltonian_energy_error, tree_depth, numerical_error, step_size, nom_step_size

Summary Statistics
  parameters       mean       std   naive_se      mcse          ess      rhat  ⋯
      Symbol    Float64   Float64    Float64   Float64      Float64   Float64  ⋯

           α    -5.1443    8.0897     0.0404    0.0633   17253.5221    1.0003  ⋯
        β[1]     1.4155    1.2263     0.0061    0.0096   16348.7635    1.0001  ⋯
        β[2]     7.2965    3.2211     0.0161    0.0298   11670.0967    1.0001  ⋯
        β[3]   -19.0043    8.2255     0.0411    0.0716   13388.9599    1.0001  ⋯
                                                                1 column omitted

Quantiles
  parameters       2.5%      25.0%      50.0%      75.0%     97.5% 
      Symbol    Float64    Float64    Float64    Float64   Float64 

           α   -20.9535   -10.6301    -5.2135     0.2916   10.9538
        β[1]    -0.9148     0.5814     1.3784     2.2170    3.9143
        β[2]     1.4642     5.0667     7.1312     9.3272   14.1133
        β[3]   -36.9478   -24.0291   -18.3973   -13.2610   -4.5984
plot(chain)
# Plug-in logistic prediction using posterior-mean coefficients.
#   x     — matrix of new observations, one row per observation, columns in
#           the same order as the design matrix used to fit `chain`
#   chain — fitted MCMCChains.Chains object
# Returns a Vector{Float64} of predicted probabilities, one per row of x.
# NOTE(review): relies on describe(chain) listing α first, then β[1], β[2], …
# — TODO confirm this ordering holds for the chain being passed in.
function prediction(x::Matrix, chain)
    summaries, quantiles = describe(chain)
    coefs = summaries[:, :mean]

    # Retrieve the number of rows.
    n, _ = size(x)

    # Generate a vector to store our predictions.
    v = Vector{Float64}(undef, n)

    # Calculate the logistic function for each row of the input matrix.
    # Bug fix: the original indexed the global `x_pred` here instead of the
    # argument `x`, so the function silently ignored its input.
    for i in 1:n
        v[i] = logistic(coefs[1] + sum(x[i, :] .* coefs[2:end]))
    end
    return v
end;

# New observation as a 1×3 row: [gender, x3, x7] = [1, 5.28, 1.78].
# The leading 1 is presumably gender = 1 encoding "female" — TODO confirm
# the gender coding against the data dictionary.
x_pred = Matrix(reshape([1, 5.28, 1.78], 1,3))

# Make the predictions.
predictions = prediction(x_pred, chain)
1-element Vector{Float64}:
 0.7249557664695061

Question 3

a. Poisson model for the problem:

I set an uninformative Normal(0, 10) prior (mean 0, standard deviation 10) for each coefficient.

# Load the micronuclei data (x = radiation dose in Gy, y = micronuclei
# count — assumed from the question text; TODO confirm).
q3 = DataFrame(CSV.File("micronuclei.csv"))
describe(q3)

2 rows × 7 columns

   variable   mean     min    median   max   nmissing   eltype
   Symbol     Float64  Real   Float64  Real  Int64      DataType
1  x          1.75     0.0    1.5      4.0   0          Float64
2  y          0.303    0      0.0      6     0          Int64
# Bayesian Poisson regression with a log link: y[i] ~ Poisson(exp(β₁ + β₂·x[i])).
#   x — predictor vector (dose)
#   y — count response vector
#   σ — prior standard deviation for both coefficients. The original named
#       this `σ²`, but Distributions.Normal takes a *standard deviation* as
#       its second argument, so the value is used as an sd (the call site
#       passes 10, i.e. Normal(0, 10) with sd 10); renamed to avoid implying
#       a variance. Positional args are not passed by name in Julia, so the
#       rename is caller-compatible.
#   n — number of observations (defaults from x)
@model function poissonreg(x, y, σ; n= size(x,1))
    β ~ filldist(Normal(0, σ), 2)

    # LogPoisson applies exp() to its argument, giving the log-link Poisson.
    for i = 1:n
        y[i] ~ LogPoisson(β[1] + β[2]*x[i])
    end
end

# Fit with prior scale 10 (Normal(0, 10) on both coefficients);
# 4 chains × 6,000 draws with NUTS.
model = poissonreg(q3.x, q3.y, 10)
chain = sample(model, NUTS(), MCMCThreads(), 6_000, 4)
Chains MCMC chain (6000×14×4 Array{Float64, 3}):

Iterations        = 1001:1:7000
Number of chains  = 4
Samples per chain = 6000
Wall duration     = 64.92 seconds
Compute duration  = 64.5 seconds
parameters        = β[1], β[2]
internals         = lp, n_steps, is_accept, acceptance_rate, log_density, hamiltonian_energy, hamiltonian_energy_error, max_hamiltonian_energy_error, tree_depth, numerical_error, step_size, nom_step_size

Summary Statistics
  parameters      mean       std   naive_se      mcse         ess      rhat    ⋯
      Symbol   Float64   Float64    Float64   Float64     Float64   Float64    ⋯

        β[1]   -2.8137    0.0633     0.0004    0.0009   5426.4735    1.0004    ⋯
        β[2]    0.6709    0.0196     0.0001    0.0003   5428.6900    1.0004    ⋯
                                                                1 column omitted

Quantiles
  parameters      2.5%     25.0%     50.0%     75.0%     97.5% 
      Symbol   Float64   Float64   Float64   Float64   Float64 

        β[1]   -2.9386   -2.8563   -2.8134   -2.7706   -2.6915
        β[2]    0.6333    0.6575    0.6708    0.6841    0.7096
# Gelman diagnostic: PSRF values close to 1 indicate the chains have
# converged (this measures mixing across chains, not goodness of fit).
gelmandiag(chain)
Gelman, Rubin, and Brooks diagnostic
  parameters      psrf    psrfci 
      Symbol   Float64   Float64 

        β[1]    1.0005    1.0010
        β[2]    1.0006    1.0013
plot(chain)

b. The average number of micronuclei for a dose of 3.5 Gy is 0.6277056540217703

# Plug-in Poisson-mean prediction using posterior-mean coefficients.
# (The name keeps the original "poission" spelling so existing call sites work.)
#   x     — matrix of new dose values, one per entry
#   chain — fitted MCMCChains.Chains object
# Returns a Vector{Float64} of expected counts, one per row of x.
# NOTE(review): relies on describe(chain) listing β[1] then β[2] — confirm.
function poission_prediction(x::Matrix, chain)
    summaries, quantiles = describe(chain)
    coefs = summaries[:, :mean]

    # Retrieve the number of rows.
    n, _ = size(x)

    # Generate a vector to store our predictions.
    v = Vector{Float64}(undef, n)

    # Expected count via the inverse log link: exp(β₁ + β₂·x).
    # (The original wrapped this scalar in a redundant sum() and its comment
    # described it as the "logistic function"; both corrected.)
    for i in 1:n
        v[i] = exp(coefs[1] + coefs[2]*x[i])
    end
    return v
end;

# Predict the expected micronuclei count at a dose of 3.5 Gy.
x_pred = Matrix(reshape([3.5], 1, 1))

# Make the predictions.
predictions = poission_prediction(x_pred, chain)
1-element Vector{Float64}:
 0.6277056540217703